Get the data

suppressMessages({
  library(bigsnpr)
  library(tidyverse)
  library(foreach)
})

celiac <- snp_attach("backingfiles/celiacQC.rds")
G <- celiac$genotypes
CHR <- celiac$map$chromosome
POS <- celiac$map$physical.pos
NCORES <- parallel::detectCores() / 2

Without removing long-range LD regions

system.time(
  ind.keep <- snp_clumping(G, CHR, ncores = NCORES)
)
##    user  system elapsed 
##   4.156   0.537  88.764
system.time(
  obj.svd <- big_randomSVD(G, snp_scaleBinom(), 
                           ind.col = ind.keep,
                           ncores = NCORES)
)
##    user  system elapsed 
##   0.081   0.097  31.434
(plot_loadingsA <- plot(obj.svd, type = "loadings", loadings = 1:6, coeff = 1))

oneSNP <- which.max(abs(obj.svd$v[, 4]))
(plot_scoresA <- plot(obj.svd, type = "scores", scores = 4:5, coeff = 2) +
    aes(color = as.factor(attach.BM(G)[, ind.keep[oneSNP]])) + 
    labs(title = NULL, color = "Genotype") +
    theme(legend.position = c(0.15, 0.15)) +
    guides(color = guide_legend(override.aes = list(size = 4))))

Automatically removing long-range LD regions

system.time(
  test <- snp_autoSVD(G, CHR, POS, ncores = NCORES)
)
## Phase of clumping at r2 > 0.2.. keep 94517 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 3 long-range LD regions were detected..
## 
## Iteration 2:
## Computing SVD..
## 2 long-range LD regions were detected..
## 
## Iteration 3:
## Computing SVD..
## 
## Converged!
##    user  system elapsed 
##  44.161   1.891 281.557
htmlTable::htmlTable(attr(test, "lrldr"), rnames = FALSE,
                     css.table = "margin-top: 1em; 
                                  margin-bottom: 1em;   
                                  margin-left: auto;
                                  margin-right: auto",
                     css.cell = "padding: 5px 5px 5px 5px")
Chr Start Stop
2 134374210 138130055
6 23788094 35793692
8 6341567 13526256
3 163071168 165026940
14 46690298 47500807
attr(test, "lrldr") %>%
  transmute(
    Chromosome = Chr,
    `Start (Mb)` = Start / 1e6,
    `Stop (Mb)` = Stop / 1e6
  ) %>%
  xtable::xtable(digits = 1, caption = "", label = "", auto = TRUE)
% latex table generated in R 3.3.3 by xtable 1.8-2 package
% Sun Jul 30 20:46:16 2017
\begin{table}[ht]
\centering
\begin{tabular}{lrrr}
  \hline
 & Chromosome & Start (Mb) & Stop (Mb) \\ 
  \hline
1 &  2 & 134.4 & 138.1 \\ 
  2 &  6 & 23.8 & 35.8 \\ 
  3 &  8 & 6.3 & 13.5 \\ 
  4 &  3 & 163.1 & 165.0 \\ 
  5 & 14 & 46.7 & 47.5 \\ 
   \hline
\end{tabular}
\caption{} 
\label{}
\end{table}
(plot_loadingsB <- plot(test, type = "loadings", loadings = 1:6, coeff = 1))

# Get population from external files
pop.files <- list.files(path = "../thesis-celiac/Dubois2010_data/",
                        pattern = "cluster_*",
                        full.names = TRUE)
pop <- snp_getSampleInfos(celiac, pop.files)[[1]]
pop.names <- c("Netherlands", "Italy", "UK1", "UK2", "Finland")

(plot_scoresB <- plot(test, type = "scores", scores = 4:5, coeff = 2) +
    aes(color = pop.names[pop]) + 
    labs(title = NULL, color = "Population") +
    theme(legend.position = c(0.2, 0.75)) +
    guides(color = guide_legend(override.aes = list(size = 4))))

# save plots for paper
p <- cowplot::plot_grid(plot_loadingsA, plot_loadingsB, align = "hv", 
                        ncol = 2, labels = c("A", "B"), label_size = 30)
ggsave("figures/loadings.png", width = 1620, height = 920, scale = 1/75)

p <- cowplot::plot_grid(plot_scoresA, plot_scoresB, align = "hv", 
                        ncol = 2, labels = c("A", "B"), label_size = 30)
ggsave("figures/scores.png", width = 1640, height = 750, scale = 1/75)
doParallel::registerDoParallel(NCORES)

nBoot <- 500

tribble(
  ~name,                                               ~svd,
  "When automatically removing long-range LD regions", test, 
  "When keeping long-range LD regions",                obj.svd
) %>%
  apply(1, function(method) {
    V <- method[["svd"]][["v"]]^2
    m <- nrow(V)
    foreach(ic = seq_len(nBoot), .combine = 'rbind') %dopar% {
      ind <- sort(sample(m, replace = TRUE))
      apply(V[ind, ], 2, ineq::Gini)
    } %>%
      reshape2::melt() %>%
      cbind(method = method[["name"]])
  }) %>%
  Reduce(f = 'rbind') %>%
  ggplot(aes(as.factor(Var2), value)) %>%
  bigstatsr:::MY_THEME(coeff = 2) +
  geom_hline(yintercept = 2 / pi, lty = 2) +
  geom_boxplot(aes(color = method), lwd = 1.5) +
  labs(y = "Gini coefficient of squared loadings", 
       x = "Principal Component", color = "") +
  theme(legend.position = "top") +
  guides(color = guide_legend(ncol = 1)) 

doParallel::stopImplicitCluster()

ggsave("figures/gini.png", width = 1341, height = 865, scale = 1/75)

Removing long-range LD regions based on predefined table

ind.excl0 <- snp_indLRLDR(CHR, POS)
ind.keep0 <- snp_clumping(G, CHR, exclude = ind.excl0, ncores = NCORES)
obj.svd0 <- big_randomSVD(G, snp_scaleBinom(), 
                          ind.col = ind.keep0,
                          ncores = NCORES)
corr <- cor(test$u, obj.svd0$u)
rownames(corr) <- colnames(corr) <- paste0("PC", 1:10)
htmlTable::htmlTable(round(100 * corr, 2), 
                     align = paste(c("|", rep("c", 10)), collapse = ""),
                     css.table = "margin-top: 1em; 
                                  margin-bottom: 1em;   
                                  margin-left: auto;
                                  margin-right: auto",
                     css.cell = "padding: 5px 5px 5px 5px")
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
PC1 99.99 0.07 0.05 0 0.01 -0.02 -0.01 -0.01 0 -0.02
PC2 -0.07 99.98 0.02 0.02 0.01 -0.03 0 -0.02 0 -0.04
PC3 -0.06 -0.02 99.88 0.19 -0.03 0.05 0.11 0.1 -0.04 -0.11
PC4 0 -0.01 -0.23 99.89 0.15 0.14 -0.07 0.03 -0.01 0.09
PC5 0 0 -0.01 -0.16 99.64 -0.29 0.34 -0.11 -0.93 0.67
PC6 0.02 0.04 -0.05 -0.18 0.27 99.62 0.5 -0.45 0.13 -0.47
PC7 0.02 0.04 -0.14 0.01 -0.53 -0.43 98.9 3.14 -0.85 1.96
PC8 0 -0.01 0.29 0.09 -0.24 -0.41 3.38 -97.96 -5.35 1.13
PC9 0 0.02 0.09 -0.02 0.69 0.01 0.98 -4.99 96.8 10.44
PC10 0.02 0 0.09 -0.05 -0.49 0.31 -1.99 -0.1 -8.69 91.72
mean(diag(abs(corr)))
## [1] 0.9843747